Leitura de arquivos individuais

Business

Clusterização dos estabelecimentos por semelhança de atributos

yelp_bz <- yelp_bz_raw %>% 
          select_if(~is.numeric(.)) %>% 
          mutate_all(~replace(., is.na(.), 0))

glimpse(yelp_bz)
## Rows: 14,962
## Columns: 34
## $ latitude                   <dbl> 43.62661, 43.64041, 43.61129, 43.70441, 43…
## $ longitude                  <dbl> -79.50209, -79.39058, -79.55687, -79.37511…
## $ review_count               <dbl> 4, 81, 3, 3, 4, 6, 10, 52, 14, 4, 4, 11, 7…
## $ stars                      <dbl> 2.0, 2.5, 1.0, 5.0, 3.0, 4.5, 3.0, 2.5, 3.…
## $ AcceptsInsurance           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ AgesAllowed                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Alcohol                    <dbl> 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ BYOB                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ BikeParking                <dbl> 2, 1, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 2, 0, …
## $ BusinessAcceptsCreditCards <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ByAppointmentOnly          <dbl> 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Caters                     <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, …
## $ CoatCheck                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Corkage                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ DogsAllowed                <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ DriveThru                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ GoodForDancing             <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ GoodForKids                <dbl> 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, …
## $ HappyHour                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ HasTV                      <dbl> 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ NoiseLevel                 <dbl> 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ OutdoorSeating             <dbl> 0, 2, 0, 0, 0, 0, 2, 1, 2, 0, 0, 2, 1, 1, …
## $ RestaurantsAttire          <dbl> 0, 3, 0, 0, 0, 0, 3, 3, 3, 0, 0, 3, 3, 0, …
## $ RestaurantsDelivery        <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, …
## $ RestaurantsGoodForGroups   <dbl> 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, …
## $ RestaurantsPriceRange2     <dbl> 2, 2, 0, 0, 2, 0, 2, 1, 2, 0, 0, 1, 2, 0, …
## $ RestaurantsReservations    <dbl> 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, …
## $ RestaurantsTableService    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ RestaurantsTakeOut         <dbl> 0, 2, 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 2, 0, …
## $ Smoking                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ WheelchairAccessible       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, …
## $ WiFi                       <dbl> 0, 3, 0, 0, 0, 0, 3, 3, 3, 0, 0, 3, 3, 0, …
## $ tips_counter_bz            <dbl> 0, 14, 0, 0, 0, 1, 4, 5, 5, 0, 0, 6, 0, 1,…
## $ total_compliments_bz       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

PCA

Será aplicada uma análise de componentes principais para entender a variabilidade da nota dos estabelecimentos considerando seus atributos.

rec_pca <- recipe(stars ~ ., yelp_bz) %>% 
  update_role(contains('id'), new_role = 'id') %>% 
  #step_date(date_rv, yelping_since_usr, features = c("dow", "month","year")) %>% 
  #step_other(categories, threshold = 0.005) %>% 
  #step_other(postal_code, threshold = 0.01) %>% 
  #step_dummy(all_nominal(), -'business_id',-'user_id',-'name_bz') %>%
  step_normalize(all_numeric(), -all_outcomes()) %>% 
  step_pca(all_numeric(), -all_outcomes()) %>% 
  step_naomit(all_numeric()) %>% 
  prep()

yelp_bz_pca <- juice(rec_pca)

Scree plot

variance_pct <- rec_pca$steps[[2]]$res

(cumsum(variance_pct$sdev^2) / sum(variance_pct$sdev^2))
##  [1] 0.2795283 0.3569530 0.4181779 0.4760438 0.5246121 0.5650128 0.6019585
##  [8] 0.6384455 0.6714870 0.7020952 0.7307504 0.7590915 0.7806310 0.8013364
## [15] 0.8214466 0.8408425 0.8588443 0.8747742 0.8894284 0.9032762 0.9169327
## [22] 0.9301989 0.9408237 0.9507912 0.9598340 0.9678033 0.9751966 0.9818828
## [29] 0.9879589 0.9930369 0.9974065 1.0000000 1.0000000
fviz_eig(variance_pct, addlabels = TRUE) + 
  labs(x = "Componente Principal",
       y = "Percentual explicado da variância")
## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4

Mais de 50% da variabilidade é explicada pelas 5 primeiras componentes, que são compostas da seguinte forma:

Drivers

tidy_pca <- tidy(rec_pca, 2)

tidy_pca %>%
  filter(component %in% paste0("PC", 1:6)) %>%
  group_by(component) %>%
  top_n(15, abs(value)) %>%
  ungroup() %>%
  mutate(terms = reorder_within(terms, abs(value), component)) %>%
  ggplot(aes(abs(value), terms, fill = value > 0)) +
  geom_col() +
  facet_wrap(~component, scales = "free_y") +
  scale_y_reordered() +
  labs(
    x = "Valor absoluto da contribuição",
    y = NULL, fill = "Valor > 0")

Na primeira componente, os pesos são igualmente distribuídos, o que indica que todos os atributos tem impacto semelhante na maior parte da variabilidade.

Pela segunda componente, no entanto, observa-se que a existência de um local para deixar o casaco, ser permitido fumar e ser um bom local para dançar são mais relevante, assim como a localização (PC6). Além disso, cobrança de rolha e necessidade de levar a bebida também são drivers importantes, pois aparecem em mais de uma componente.

Contrastes

variance_pct %>% 
  fviz_pca_var(axes = c(1,2), col.var="contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"))

Os maiores contrastes são entre a modalidade de atendimento dos restaurante: Apenas delivery ou com reservas.

Users

Classificação dos usuários conforme o perfil.

K-Means

Será utilizada a clusterização k-médias por conta da quantidade de características dos usários presentes na base. Também foi tentada a aplicação de uma clusterização hierárquica, mas os resultados obtidos não foram tão interpretáveis como os seguintes.

set.seed(123)

glimpse(yelp_users)
## Rows: 119,792
## Columns: 23
## $ user_id            <chr> "-4Anvj46CWf57KWI9UQDLg", "-BUamlG3H-7yqpAl1p-msw"…
## $ average_stars      <dbl> 3.50, 1.50, 3.00, 3.56, 3.00, 4.00, 4.17, 3.57, 4.…
## $ compliment_cool    <dbl> 0, 0, 0, 0, 0, 0, 0, 169, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ compliment_cute    <dbl> 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ compliment_funny   <dbl> 0, 0, 0, 0, 0, 0, 0, 169, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ compliment_hot     <dbl> 0, 0, 0, 0, 0, 0, 0, 94, 0, 0, 0, 0, 0, 0, 0, 2, 0…
## $ compliment_list    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ compliment_more    <dbl> 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ compliment_note    <dbl> 0, 0, 1, 0, 0, 0, 0, 16, 0, 1, 0, 0, 0, 0, 0, 1, 0…
## $ compliment_photos  <dbl> 0, 0, 0, 0, 0, 0, 0, 97, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ compliment_plain   <dbl> 0, 0, 0, 0, 0, 0, 0, 66, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ compliment_profile <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ compliment_writer  <dbl> 0, 0, 0, 0, 0, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cool               <dbl> 2, 0, 1, 0, 1, 0, 0, 1562, 2, 1, 1, 9, 0, 5, 0, 9,…
## $ elite_count        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ fans               <dbl> 1, 0, 0, 0, 0, 0, 0, 39, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ friends_count      <dbl> 1, 16, 15, 27, 1, 1, 1, 338, 59, 6, 10, 100, 8, 1,…
## $ funny              <dbl> 0, 0, 1, 0, 0, 0, 0, 1266, 3, 1, 4, 0, 1, 1, 1, 5,…
## $ review_count_usr   <dbl> 2, 2, 4, 27, 2, 6, 6, 66, 28, 3, 8, 37, 4, 20, 1, …
## $ useful             <dbl> 2, 0, 1, 5, 1, 3, 16, 1683, 12, 1, 2, 30, 4, 30, 0…
## $ year_since         <dbl> 2016, 2016, 2011, 2019, 2014, 2017, 2014, 2019, 20…
## $ tips_counter       <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 19, 0, 0, 0, 0, 0, 2, 0…
## $ total_compliments  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
yelp_pad <- yelp_users %>% 
              select(-user_id) %>% 
              scale()

kclusts <- tibble(k = 1:30) %>%
  mutate(kclust = map(k, ~kmeans(yelp_pad, .x)),
        tidied = map(kclust, tidy),
        glanced = map(kclust, glance),
        augmented = map(kclust, augment, yelp_pad)
        )

clusters <- kclusts %>%
  unnest(cols = c(tidied))

assignments <- kclusts %>% 
  unnest(cols = c(augmented))

clusterings <- kclusts %>%
  unnest(cols = c(glanced))
### Cotovelo

clusterings %>% 
  ggplot(aes(k, tot.withinss)) + 
    geom_point(size = 3) + 
    geom_line() + 
    labs(y = "total within sum of squares", x = "k") +
    scale_x_continuous(breaks = 1:30)

Pelo gráfico do cotovelo, poderiam ser selecionado um número de clusters (k) de 11 a 17, a seguir é possível ver uma comparacão em relação às dicas.

#k-means
assignments %>% 
  filter(k %in% paste0(10:20)) %>%
  ggplot(aes(x = tips_counter, y = total_compliments)) +
  geom_point(aes(color = .cluster), alpha = 0.5) + 
  facet_wrap(~ k, nrow = 3)

Para a classificação final dos usuário, será feita novamente a clusterização, mas considerando apenas o k ideal.

set.seed(123)
kmeans_usr <-  kmeans(yelp_pad, 11)

yelp_usr_cluster <- yelp_users %>% 
          mutate(cluster_usr = kmeans_usr$cluster)

glimpse(yelp_usr_cluster)
## Rows: 119,792
## Columns: 24
## $ user_id            <chr> "-4Anvj46CWf57KWI9UQDLg", "-BUamlG3H-7yqpAl1p-msw"…
## $ average_stars      <dbl> 3.50, 1.50, 3.00, 3.56, 3.00, 4.00, 4.17, 3.57, 4.…
## $ compliment_cool    <dbl> 0, 0, 0, 0, 0, 0, 0, 169, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ compliment_cute    <dbl> 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ compliment_funny   <dbl> 0, 0, 0, 0, 0, 0, 0, 169, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ compliment_hot     <dbl> 0, 0, 0, 0, 0, 0, 0, 94, 0, 0, 0, 0, 0, 0, 0, 2, 0…
## $ compliment_list    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ compliment_more    <dbl> 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ compliment_note    <dbl> 0, 0, 1, 0, 0, 0, 0, 16, 0, 1, 0, 0, 0, 0, 0, 1, 0…
## $ compliment_photos  <dbl> 0, 0, 0, 0, 0, 0, 0, 97, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ compliment_plain   <dbl> 0, 0, 0, 0, 0, 0, 0, 66, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ compliment_profile <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ compliment_writer  <dbl> 0, 0, 0, 0, 0, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cool               <dbl> 2, 0, 1, 0, 1, 0, 0, 1562, 2, 1, 1, 9, 0, 5, 0, 9,…
## $ elite_count        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ fans               <dbl> 1, 0, 0, 0, 0, 0, 0, 39, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ friends_count      <dbl> 1, 16, 15, 27, 1, 1, 1, 338, 59, 6, 10, 100, 8, 1,…
## $ funny              <dbl> 0, 0, 1, 0, 0, 0, 0, 1266, 3, 1, 4, 0, 1, 1, 1, 5,…
## $ review_count_usr   <dbl> 2, 2, 4, 27, 2, 6, 6, 66, 28, 3, 8, 37, 4, 20, 1, …
## $ useful             <dbl> 2, 0, 1, 5, 1, 3, 16, 1683, 12, 1, 2, 30, 4, 30, 0…
## $ year_since         <dbl> 2016, 2016, 2011, 2019, 2014, 2017, 2014, 2019, 20…
## $ tips_counter       <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 19, 0, 0, 0, 0, 0, 2, 0…
## $ total_compliments  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ cluster_usr        <int> 3, 7, 10, 9, 3, 9, 4, 9, 4, 10, 3, 10, 4, 3, 7, 9,…

Pelo gráfico, observa-se claramente a divisão dos usuários em relação ao tempo na plataforma, a nota média e a quantidade de fãs.

plot_ly(yelp_usr_cluster, x = ~year_since, 
               y = ~average_stars,
               z = ~fans, color = ~cluster_usr,
              text = ~paste('Cluster: ', cluster_usr)) %>% 
  add_markers() %>% 
  layout(scene = list(xaxis = list(title = 'No Yelp desde'),
                                   yaxis = list(title = 'Nota Média'),
                                   zaxis = list(title = 'Quantidade de fãs')))
yelp_usr_cluster %>% 
          select(user_id, cluster_usr) %>%
          write.csv(file = "output/usr_cluster.csv")

Como próximos passos, seria interessante entnder melhor as características de cada cluster. Para classificar usuário que não estão na base, será utilizado um modelo de árvore para fazer a classificação. O intuito é obter de uma forma rápida o cluster de um novo usuário.

Modelo para definição do cluster do usuário

user_cluster_tree <- yelp_usr_cluster %>% 
                    select(-user_id) %>% 
                    rpart(cluster_usr ~ ., data = .)

plot_arvore <- as.party(user_cluster_tree)

#plot(plot_arvore)

Rede Neural

Leitura da base final

yelp_rv <- yelp_raw %>% 
  #mutate(line = row_number()) %>% 
  select(-'year_rv') %>% 
  mutate(stars_rv = replace(stars_rv >= 4,1,0)) %>% 
  select_if(is.numeric) #%>% sample_frac(0.50)

glimpse(yelp_rv)
## Rows: 219,462
## Columns: 58
## $ average_stars              <dbl> 1.00, 3.21, 1.50, 3.99, 4.23, 3.64, 3.67, …
## $ compliment_cool            <dbl> 0, 0, 0, 4, 2, 1, 0, 3, 0, 0, 0, 0, 1, 0, …
## $ compliment_cute            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ compliment_funny           <dbl> 0, 0, 0, 4, 2, 1, 0, 3, 0, 0, 0, 0, 1, 0, …
## $ compliment_hot             <dbl> 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, …
## $ compliment_list            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ compliment_more            <dbl> 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, …
## $ compliment_note            <dbl> 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ compliment_photos          <dbl> 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ compliment_plain           <dbl> 0, 0, 0, 1, 2, 0, 0, 1, 0, 0, 1, 0, 1, 0, …
## $ compliment_profile         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ compliment_writer          <dbl> 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, …
## $ cool                       <dbl> 0, 2, 0, 35, 45, 1, 0, 37, 0, 0, 17, 0, 8,…
## $ elite_count                <dbl> 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, …
## $ fans                       <dbl> 0, 0, 0, 1, 7, 0, 0, 9, 0, 0, 4, 0, 1, 0, …
## $ friends_count              <dbl> 12, 1, 1, 44, 31, 8, 1, 147, 1, 20, 34, 1,…
## $ funny                      <dbl> 0, 1, 0, 18, 27, 0, 1, 20, 0, 0, 2, 0, 9, …
## $ review_count_usr           <dbl> 1, 19, 2, 105, 279, 14, 3, 190, 2, 4, 38, …
## $ useful                     <dbl> 0, 1, 1, 90, 99, 1, 1, 100, 1, 1, 19, 1, 5…
## $ year_since                 <dbl> 2017, 2016, 2017, 2011, 2017, 2017, 2015, …
## $ tips_counter               <dbl> 0, 0, 0, 1, 1, 5, 1, 9, 0, 1, 0, 0, 5, 0, …
## $ total_compliments          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cluster_usr                <dbl> 7, 3, 7, 10, 9, 9, 3, 4, 3, 7, 9, 7, 4, 3,…
## $ stars_rv                   <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ latitude                   <dbl> 43.64041, 43.64041, 43.64041, 43.64041, 43…
## $ longitude                  <dbl> -79.39058, -79.39058, -79.39058, -79.39058…
## $ review_count               <dbl> 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81…
## $ stars                      <dbl> 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.…
## $ AcceptsInsurance           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ AgesAllowed                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Alcohol                    <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ BYOB                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ BikeParking                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ BusinessAcceptsCreditCards <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ByAppointmentOnly          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Caters                     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ CoatCheck                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Corkage                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ DogsAllowed                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ DriveThru                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ GoodForDancing             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ GoodForKids                <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ HappyHour                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ HasTV                      <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ NoiseLevel                 <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ OutdoorSeating             <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ RestaurantsAttire          <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ RestaurantsDelivery        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ RestaurantsGoodForGroups   <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ RestaurantsPriceRange2     <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ RestaurantsReservations    <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ RestaurantsTableService    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ RestaurantsTakeOut         <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ Smoking                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ WheelchairAccessible       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ WiFi                       <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ tips_counter_bz            <dbl> 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14…
## $ total_compliments_bz       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

Bases de Treino e Teste

split <- initial_split(yelp_rv, prop = 0.8 , strata = stars_rv)

train_val <- training(split)


split_val <- initial_split(train_val, prop = 0.5, strata = stars_rv)

yelp_train <- training(split_val)
yelp_val <- testing(split_val)
yelp_test <- testing(split)
  • Normalização pela média e desvio padrão da base teste
mean <- yelp_train %>% 
        select(-stars_rv) %>% 
        apply(., 2, mean) 

std <- yelp_train %>% 
        select(-stars_rv) %>% 
        apply(., 2, sd)
x_train <- yelp_train %>% 
            select(-stars_rv) %>% 
            scale(center = mean, scale = std) %>% 
            as.matrix()

dim(x_train)
## [1] 87786    57
y_train <- yelp_train %>% 
            select(stars_rv) %>% 
            as.matrix()

x_val <-  yelp_val %>% 
            select(-stars_rv) %>% 
            scale(center = mean, scale = std) %>% 
            as.matrix()

dim(x_val)
## [1] 87785    57
y_val <- yelp_val %>% 
            select(stars_rv) %>% 
            data.matrix()

dim(x_val)
## [1] 87785    57
x_test <- yelp_test %>% 
          select(-stars_rv) %>% 
          scale(center = mean, scale = std) %>% 
          as.matrix()

dim(x_test) 
## [1] 43891    57
y_test <- yelp_test %>% 
            select(stars_rv) %>% 
            data.matrix()

Modelo

rm(yelp_nn)

yelp_nn <- keras_model_sequential() %>% 
  layer_dense(units = 30, activation = "tanh", input_shape = ncol(x_train)) %>%
  layer_dropout(rate = 0.5) %>%
  layer_dense(units = 16, activation = "relu") %>%
  #layer_dropout(rate = 0.5) %>%
  layer_dense(units = 16, activation = "relu") %>%
  #layer_dense(units = 6, activation = "softmax")
  layer_dense(units = 1, activation = "sigmoid")

yelp_nn %>% 
  compile(optimizer = "rmsprop", 
          #loss = "sparse_categorical_crossentropy", 
          loss = "binary_crossentropy",
          metrics = c("accuracy"))


history <- yelp_nn %>% 
  fit(x_train, y_train, 
      epochs = 40, batch_size = 512, 
      validation_data = list(x_val, y_val))

plot(history)
## `geom_smooth()` using formula 'y ~ x'

#keras::get_weights(yelp_nn)

(results <- yelp_nn %>% evaluate(x_test, y_test))
##      loss  accuracy 
## 0.4634990 0.7728692

Foi adicionada uma camada de dropout na rede neural, para diminuir o overfit do modelo. Observa-se que foi efeciente, pois a perda da base de validação não ultrappassa a perda da base de treino.

Desempenho do modelo

tibble(observado = factor(y_test)) %>% 
  bind_cols(data.frame(prob = predict(yelp_nn, as.matrix(x_test)))) %>% 
  roc_auc(observado, prob)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.837
tibble(observado = factor(y_test)) %>% 
  bind_cols(data.frame(prob = predict(yelp_nn, as.matrix(x_test)))) %>% 
  roc_curve(observado, prob) %>% 
  autoplot()

Pelo gráfico, observa-se que o modelo atingiu um desempenho bom na base de teste.

Recomendação

Usuário criado

Criação de um usuário e definição de seu cluster

glimpse(yelp_usr_cluster)
## Rows: 119,792
## Columns: 24
## $ user_id            <chr> "-4Anvj46CWf57KWI9UQDLg", "-BUamlG3H-7yqpAl1p-msw"…
## $ average_stars      <dbl> 3.50, 1.50, 3.00, 3.56, 3.00, 4.00, 4.17, 3.57, 4.…
## $ compliment_cool    <dbl> 0, 0, 0, 0, 0, 0, 0, 169, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ compliment_cute    <dbl> 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ compliment_funny   <dbl> 0, 0, 0, 0, 0, 0, 0, 169, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ compliment_hot     <dbl> 0, 0, 0, 0, 0, 0, 0, 94, 0, 0, 0, 0, 0, 0, 0, 2, 0…
## $ compliment_list    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ compliment_more    <dbl> 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ compliment_note    <dbl> 0, 0, 1, 0, 0, 0, 0, 16, 0, 1, 0, 0, 0, 0, 0, 1, 0…
## $ compliment_photos  <dbl> 0, 0, 0, 0, 0, 0, 0, 97, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ compliment_plain   <dbl> 0, 0, 0, 0, 0, 0, 0, 66, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ compliment_profile <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ compliment_writer  <dbl> 0, 0, 0, 0, 0, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cool               <dbl> 2, 0, 1, 0, 1, 0, 0, 1562, 2, 1, 1, 9, 0, 5, 0, 9,…
## $ elite_count        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ fans               <dbl> 1, 0, 0, 0, 0, 0, 0, 39, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ friends_count      <dbl> 1, 16, 15, 27, 1, 1, 1, 338, 59, 6, 10, 100, 8, 1,…
## $ funny              <dbl> 0, 0, 1, 0, 0, 0, 0, 1266, 3, 1, 4, 0, 1, 1, 1, 5,…
## $ review_count_usr   <dbl> 2, 2, 4, 27, 2, 6, 6, 66, 28, 3, 8, 37, 4, 20, 1, …
## $ useful             <dbl> 2, 0, 1, 5, 1, 3, 16, 1683, 12, 1, 2, 30, 4, 30, 0…
## $ year_since         <dbl> 2016, 2016, 2011, 2019, 2014, 2017, 2014, 2019, 20…
## $ tips_counter       <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 19, 0, 0, 0, 0, 0, 2, 0…
## $ total_compliments  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ cluster_usr        <int> 3, 7, 10, 9, 3, 9, 4, 9, 4, 10, 3, 10, 4, 3, 7, 9,…
rm(user)

compliment_max <- 50

user <- tibble(user_id = 'random_user',
               average_stars = round(runif(1, 1.0, 5),2),
               compliment_cool = ceiling(runif(1,0, compliment_max)),
               compliment_cute = ceiling(runif(1,0, compliment_max)),
               compliment_funny = ceiling(runif(1,0, compliment_max)),
               compliment_hot  = ceiling(runif(1,0, compliment_max)),
               compliment_list = ceiling(runif(1,0, compliment_max)),
               compliment_more = ceiling(runif(1,0, compliment_max)),
               compliment_note = ceiling(runif(1,0, compliment_max)),
               compliment_photos = ceiling(runif(1,0, compliment_max)),
               compliment_plain = ceiling(runif(1,0, compliment_max)),
               compliment_profile = ceiling(runif(1,0, compliment_max)),
               compliment_writer = ceiling(runif(1,0, compliment_max)),
               cool = ceiling(runif(1,0, compliment_max)),
               elite_count = 0,
               fans = ceiling(runif(1,0, compliment_max)),
               friends_count = ceiling(runif(1,0, compliment_max)),
               funny = ceiling(runif(1,0, compliment_max)),
               review_count_usr = ceiling(runif(1,0,compliment_max)),
               useful = ceiling(runif(1,0, compliment_max)),
               year_since = ceiling(runif(1,2004, 2019)),
               tips_counter = ceiling(runif(1,0, compliment_max)),
               total_compliments = ceiling(runif(1,0, compliment_max))
                )

## criação aleatória do número de anos que o usuário foi elite
user$elite_count <- ceiling(runif(1,0, (2020-user$year_since)))

#encontra o número do cluster em que o usuário se encaixa
user$cluster_usr <- user_cluster_tree %>%
      predict(user) %>% 
      ceiling()
  
glimpse(user)
## Rows: 1
## Columns: 24
## $ user_id            <chr> "random_user"
## $ average_stars      <dbl> 4.07
## $ compliment_cool    <dbl> 14
## $ compliment_cute    <dbl> 17
## $ compliment_funny   <dbl> 2
## $ compliment_hot     <dbl> 14
## $ compliment_list    <dbl> 13
## $ compliment_more    <dbl> 19
## $ compliment_note    <dbl> 37
## $ compliment_photos  <dbl> 49
## $ compliment_plain   <dbl> 46
## $ compliment_profile <dbl> 31
## $ compliment_writer  <dbl> 24
## $ cool               <dbl> 1
## $ elite_count        <dbl> 7
## $ fans               <dbl> 21
## $ friends_count      <dbl> 25
## $ funny              <dbl> 7
## $ review_count_usr   <dbl> 16
## $ useful             <dbl> 32
## $ year_since         <dbl> 2010
## $ tips_counter       <dbl> 15
## $ total_compliments  <dbl> 24
## $ cluster_usr        <dbl> 8
# seleção aleatória de estabelecimentos e notas atribuídas a cada um baseado no número de reviews

n_reviews <- user$review_count_usr

reviewed_usr <- tibble(business_id = sample(yelp_bz_raw$business_id, n_reviews), #seleçao aleatória de estabelecimentos
                                            stars_rv = ceiling(runif(n_reviews, 1.0, 5)),
                                            year_rv = ceiling(runif(n_reviews, 2009, 2019)),
                                            )
user_hist <- user %>% 
            bind_rows(replicate(n_reviews-1, user, simplify = FALSE)) %>% #replica as informações do usuário
            bind_cols(reviewed_usr) %>% #junta os estabelecimentos e notas dadas
            left_join(., yelp_bz_raw, by = 'business_id') #junta as informações dos estabelecimentos

glimpse(user_hist)
## Rows: 16
## Columns: 63
## $ user_id                    <chr> "random_user", "random_user", "random_user…
## $ average_stars              <dbl> 4.07, 4.07, 4.07, 4.07, 4.07, 4.07, 4.07, …
## $ compliment_cool            <dbl> 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14…
## $ compliment_cute            <dbl> 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17…
## $ compliment_funny           <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ compliment_hot             <dbl> 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14…
## $ compliment_list            <dbl> 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13…
## $ compliment_more            <dbl> 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19…
## $ compliment_note            <dbl> 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37…
## $ compliment_photos          <dbl> 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49…
## $ compliment_plain           <dbl> 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46…
## $ compliment_profile         <dbl> 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31…
## $ compliment_writer          <dbl> 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24…
## $ cool                       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ elite_count                <dbl> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
## $ fans                       <dbl> 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21…
## $ friends_count              <dbl> 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25…
## $ funny                      <dbl> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
## $ review_count_usr           <dbl> 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16…
## $ useful                     <dbl> 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32…
## $ year_since                 <dbl> 2010, 2010, 2010, 2010, 2010, 2010, 2010, …
## $ tips_counter               <dbl> 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15…
## $ total_compliments          <dbl> 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24…
## $ cluster_usr                <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, …
## $ business_id                <chr> "r-kj-kBSKFKh0sM8EVX8AA", "H7rpWv02D6WTu6I…
## $ stars_rv                   <dbl> 2, 4, 5, 4, 3, 5, 3, 3, 5, 5, 2, 5, 3, 5, …
## $ year_rv                    <dbl> 2016, 2011, 2016, 2018, 2019, 2014, 2011, …
## $ categories                 <chr> "Accessories, Women's Clothing, Men's Clot…
## $ latitude                   <dbl> 43.64127, 43.65943, 43.77336, 43.64869, 43…
## $ longitude                  <dbl> -79.43377, -79.38252, -79.49302, -79.38544…
## $ name                       <chr> "Frances Watson", "Bed Bath and Beyond", "…
## $ review_count               <dbl> 3, 28, 41, 3, 4, 4, 16, 11, 3, 42, 351, 44…
## $ stars                      <dbl> 5.0, 2.5, 3.0, 3.0, 4.0, 2.0, 4.0, 2.5, 2.…
## $ AcceptsInsurance           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ AgesAllowed                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Alcohol                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, …
## $ BYOB                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ BikeParking                <dbl> 2, 2, 2, 0, 2, 0, 0, 2, 0, 2, 2, 2, 0, 0, …
## $ BusinessAcceptsCreditCards <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, …
## $ ByAppointmentOnly          <dbl> 1, 0, 0, 0, 1, 0, 0, 2, 0, 1, 0, 0, 0, 1, …
## $ Caters                     <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, …
## $ CoatCheck                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Corkage                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ DogsAllowed                <dbl> 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ DriveThru                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ GoodForDancing             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ GoodForKids                <dbl> 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 1, 0, 2, 0, …
## $ HappyHour                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, …
## $ HasTV                      <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, …
## $ NoiseLevel                 <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, …
## $ OutdoorSeating             <dbl> 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, …
## $ RestaurantsAttire          <dbl> 0, 0, 1, 0, 1, 0, 3, 3, 0, 0, 1, 0, 0, 0, …
## $ RestaurantsDelivery        <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, …
## $ RestaurantsGoodForGroups   <dbl> 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, …
## $ RestaurantsPriceRange2     <dbl> 3, 2, 2, 0, 2, 0, 0, 3, 0, 1, 3, 2, 0, 0, …
## $ RestaurantsReservations    <dbl> 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 1, 0, …
## $ RestaurantsTableService    <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, …
## $ RestaurantsTakeOut         <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, …
## $ Smoking                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ WheelchairAccessible       <dbl> 1, 0, 0, 0, 2, 0, 2, 1, 0, 0, 0, 2, 0, 0, …
## $ WiFi                       <dbl> 0, 0, 1, 0, 1, 0, 3, 3, 0, 0, 1, 0, 0, 0, …
## $ tips_counter_bz            <dbl> 0, 6, 7, 0, 4, 2, 0, 0, 1, 1, 37, 3, 4, 0,…
## $ total_compliments_bz       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

Função para recomendação

recomm_f <- function(user, reviewed_usr){
  
  to_go <- yelp_raw %>% 
    filter(stars_rv >= 4) %>% 
    filter(cluster_usr == user$cluster_usr) %>% 
    #filter(cluster_usr == user$cluster) %>% 
    select(business_id) %>% 
    distinct() 
  
  n_go <- nrow(to_go)
  
  #filtra todos os estabelecimentos do cluster do usuário e junta as informações para modelagem
  to_review <- user %>% 
            bind_rows(replicate(n_go-1, user, simplify = FALSE)) %>% #replica as informações do usuário
            bind_cols(to_go) %>% #junta os estabelecimentos e notas dadas
            left_join(., yelp_bz_raw, by = 'business_id')
  
  #prepara a base para o modelo
  user_x_test <- to_review %>% 
          select_if(is.numeric) %>% 
          #select(-stars_rv) %>% 
          scale(center = mean, scale = std) %>% 
          as.matrix()

  #aplica a base no modelo
  predictions <- as_tibble(predict(yelp_nn, user_x_test))
  
  
  #seleciona as principais recomendações
  recommendation <- to_review %>% 
    bind_cols(pred = predictions) %>% 
    anti_join(., reviewed_usr, by = 'business_id') %>% 
    filter(V1 > 0.5)

}

Recomendação para usuário criado

Usuário aleatório da base

Para validar as recomendações, é feito o teste também com um usuário aleatório da base de teste.

n <- ceiling(runif(1,1,nrow(yelp_test)))


(random_user <- yelp_raw[n,]$user_id)
## [1] "UuH7pyPsm4E5bDfXaQJ9dg"
user2 <- yelp_usr_cluster %>% 
            filter(user_id == random_user)

reviewed_usr2 <- yelp_raw %>% 
  filter(user_id == random_user)

rec_user <- recomm_f(user2,reviewed_usr2)

Recomendação para usuário da base

#top 5 recomendações

  rec_user %>% 
    top_n(5, V1) %>%
    arrange(V1) %>% 
    mutate(rank = as.factor(row_number())) %>% 
    ggplot(aes(x = V1, y = name)) +
    geom_col() +
    labs(x = "Probabilidade de avaliação positiva",
    y = 'Recomendação')

top_5 <- rec_user %>% 
    top_n(5, V1) %>%
    arrange(-V1) %>% 
    mutate(rank = as.factor(row_number()))

top_5 %>% 
  select(name, categories, V1)
## # A tibble: 5 x 3
##   name                categories                                              V1
##   <chr>               <chr>                                                <dbl>
## 1 Duotherapy          Physical Therapy, Health & Medical, Massage Therapy  0.973
## 2 Helping Hands Doula Sleep Specialists, Health & Medical, Doulas, Massag… 0.972
## 3 Yonge Elmwood Phar… Health & Medical, Pharmacy                           0.972
## 4 Nancy Bishay, DDS   Orthodontists, Dentists, Oral Surgeons, General Den… 0.971
## 5 Motion Care Massag… Chiropractors, Health & Medical, Massage Therapy, P… 0.971
qmplot(longitude, latitude, data = top_5, 
       maptype = "toner-background", 
       color = rank,
       size = V1)
## Using zoom = 13...
## Source : http://tile.stamen.com/terrain/13/2288/2984.png
## Source : http://tile.stamen.com/terrain/13/2289/2984.png
## Source : http://tile.stamen.com/terrain/13/2290/2984.png
## Source : http://tile.stamen.com/terrain/13/2291/2984.png
## Source : http://tile.stamen.com/terrain/13/2288/2985.png
## Source : http://tile.stamen.com/terrain/13/2289/2985.png
## Source : http://tile.stamen.com/terrain/13/2290/2985.png
## Source : http://tile.stamen.com/terrain/13/2291/2985.png
## Source : http://tile.stamen.com/terrain/13/2288/2986.png
## Source : http://tile.stamen.com/terrain/13/2289/2986.png
## Source : http://tile.stamen.com/terrain/13/2290/2986.png
## Source : http://tile.stamen.com/terrain/13/2291/2986.png

Recomendação por categoria

rec_user %>% 
  select(name, categories, V1) %>% 
  unnest_tokens(category, categories) %>% 
  filter(category %in% c('food','restaurants','bars','pub')) %>% 
  group_by(category) %>%
  mutate(pred_avg = mean(V1)) %>% 
  arrange(desc(V1)) %>% 
  unique() %>% 
  slice(1:5) %>%
  ggplot(aes(V1, name, fill = category)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~category, scales = 'free') +
  scale_x_continuous() +
  scale_y_reordered() +
  labs(x = 'Probabilidade de boa avaliação')

Conclusão

Referências